* Author: Joe Tatarka
* Name: 4_1_homebase_build.do
* Purpose: Take raw homebase data from 2018-2022 and combine into a single cleaned homebase dataset

* Set Global File Paths
global root = "T:/service_industries/replication_package"
global raw_root = "${root}/datasets/raw"
global intermediate_root = "${root}/datasets/intermediate"
global built_root = "${root}/datasets/built"
global exhibits_root = "${root}/exhibits"

global homebase_root = "T:/service_industries/data/datasets/raw/homebase"

*********************************************************************************
***** 1. Bring in and append Raw data
********************************************************************************

* [INSERT RAW HOMEBASE DATA PATH HERE]
insheet using "${homebase_root}/hours_worked_only_daily_raw_data_files/2021-07/20210701_hour_worked_daily_file.csv0000_000", clear

**** Combine July 2021 - January 2023 Raw Data 
**** They are daily files, but there are duplicates observations as the daily files overlap

local m = 1
quietly foreach month in "2021-07" "2021-08" "2021-09" "2021-10" "2021-11" "2021-12" "2022-01" "2022-02" "2022-03" "2022-04" "2022-05" "2022-06" "2022-07" "2022-08" "2022-09" "2022-10" "2022-11" "2022-12" "2023-01" {

	local files : dir "${homebase_root}/hours_worked_only_daily_raw_data_files/`month'" files "*"
	local i = 1
	foreach file in `files' {
		insheet using "${homebase_root}/hours_worked_only_daily_raw_data_files/`month'/`file'", clear
		
		drop msa industry business_type 

		replace location_created_date =substr(location_created_date,1,10)
		replace job_created_date = substr(job_created_date,1,10)
		replace job_archived_date = "" if job_archived_date == "\N"
		replace job_archived_date = substr(job_archived_date,1,10)

		gen emp_level = 1 if level == "Employee"
		replace emp_level = 2 if level == "Manager"
		replace emp_level = 3 if level == "General Manager"

		label define emp_level 1 "Employee" 2 "Manager" 3 "General Manager", replace 

		label values emp_level emp_level
		drop level 

		foreach var in event_date job_archived_date job_created_date location_created_date{
			gen `var'_1 = date(`var', "YMD")
			drop `var' 
			rename `var'_1 `var'
			format `var' %td
		}

		destring avg_hourly_wage_rate total_wages_earned naics_code, replace force 

		local file_substr = substr("`file'",1,8)
		gen file_date = date("`file_substr'", "YMD")
		format file_date %td

		compress

		order event_date, first 

		tempfile homebase_`i'
		save `homebase_`i'', replace 

		local k = `i'
		local i = `i' + 1
	}


	use `homebase_1', clear 
	foreach j of numlist 2/`k' {
		append using `homebase_`j''
		erase `homebase_`j''
	}
	erase `homebase_1'

* if there are duplicates, keep duplicates with most recent file date 
	duplicates tag event_date user_id location_id avg_hourly_wage_rate total_wages_earned hours_worked, gen(dup)

	bys event_date user_id location_id avg_hourly_wage_rate total_wages_earned hours_worked: egen max_file_date = max(file_date)
	replace dup = 0 if file_date == max_file_date
	drop if dup > 0
	drop dup max_file_date
	
	*** save monthly data from July 2021 - January 2023
	tempfile data_`m' 
	save `data_`m'', replace
	local m = `m' + 1
}

*** Combine in Jan 2018 - June 2021 Data and clean the data
*** These files are not daily

local files : dir "${homebase_root}/hours_worked_dataset_historical_files" files "*"
local i = 1
foreach file in `files' {
import delimited "${homebase_root}/hours_worked_dataset_historical_files/`file'", clear

drop msa industry business_type 

replace location_created_date =substr(location_created_date,1,10)
replace job_created_date = substr(job_created_date,1,10)
replace job_archived_date = "" if job_archived_date == "\N"
replace job_archived_date = substr(job_archived_date,1,10)

gen emp_level = 1 if level == "Employee"
replace emp_level = 2 if level == "Manager"
replace emp_level = 3 if level == "General Manager"

label define emp_level 1 "Employee" 2 "Manager" 3 "General Manager", replace 

label values emp_level emp_level
drop level 

foreach var in event_date job_archived_date job_created_date location_created_date{
gen `var'_1 = date(`var', "YMD")
drop `var' 
rename `var'_1 `var'
format `var' %td
}

destring avg_hourly_wage_rate total_wages_earned naics_code, replace force 

tostring county_code, replace
replace county_code = "" if county_code == "."
replace county_code = "0" + county_code if strlen(county_code) < 5 & county_code != ""

gen file_date = .

compress

order event_date, first 

tempfile homebase_`i'
save `homebase_`i'', replace 

local i = `i' + 1
}

use `homebase_1', clear 
erase `homebase_1'
foreach j of numlist 2/10 {
	append using `homebase_`j''
	erase `homebase_`j''
}


**** Now append data for months from July 2021 - Jan 2023 
foreach month of numlist 1/19{
	append using `data_`month''
	erase `data_`month''
}

**********************************************************************
**** 2. Clean the Appended data
**********************************************************************

*** drop duplicates 
bys event_date user_id location_id: egen max_file_date = max(file_date)
duplicates tag  event_date user_id location_id, gen(dup)
replace dup = 0 if file_date == max_file_date

drop if dup > 0
drop dup max_file_date

bys user_id event_date location_id: egen max_archive_date = max(job_archived_date)
duplicates tag  event_date user_id location_id, gen(dup)
replace dup = 0 if job_archived_date == max_archive_date
drop if dup > 0
drop dup max_archive_date

bys user_id event_date location_id: egen max_wage = max(total_wages_earned)
duplicates tag  event_date user_id location_id, gen(dup)
replace dup = 0 if total_wages_earned == max_wage
drop if dup > 0
drop dup max_wage

** Keep from 2018-2022
keep if event_date < date("2023-1-1", "YMD")

*** Only Keep Limited Service Restaurants
keep if inlist(naics_code, 722513, 722514, 722515) 

*** Drop if not in the USA or if state variable is missing
drop if state == "Not USA"
replace state = "" if state == "Unclassified"

* drop if county_code is missing or if establishment is in puerto rico
drop if county_code == ""
drop if state == "pr"

* drop if no hours worked or missing hours worked
drop if hours_worked == 0 | hours_worked == .

*** Create some key variables 
gen year_month_gs = ym(year(event_date), month(event_date))
format year_month_gs %tm

gen shift_less_4 = (hours_worked <4)

gen less_4_hours = hours_worked if hours_worked <4

gen agg_hours = hours_worked

* Collapse to Monthly Level 
collapse (mean) hours_worked shift_less_4 (sum) agg_hours less_4_hours, by(year_month_gs)

************************************************************
*** 3. Save Intermediate Homebase Dataset 
************************************************************
save "${built_root}/homebase_build.dta", replace

